{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "file = './data/train_subset_1000000.csv'\n",
    "df = pd.read_csv(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--All data:999999\n",
      "--1 data:160219\n",
      "--0 data:839780\n",
      "--0 VS 1 => 5.24:1\n"
     ]
    }
   ],
   "source": [
    "print(f'--All data:{df.id.count()}')\n",
    "y_1_nums = df[df[\"click\"] == 1].id.count()\n",
    "y_0_nums = df[df[\"click\"] == 0].id.count()\n",
    "print(f'--1 data:{y_1_nums}')\n",
    "print(f'--0 data:{y_0_nums}')\n",
    "print(f'--0 VS 1 => {round(y_0_nums/y_1_nums,2)}:1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>9.376309e+18</td>\n",
       "      <td>5.236908e+18</td>\n",
       "      <td>9.984920e+12</td>\n",
       "      <td>4.846660e+18</td>\n",
       "      <td>9.834382e+18</td>\n",
       "      <td>1.373053e+19</td>\n",
       "      <td>1.844670e+19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>click</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1.602192e-01</td>\n",
       "      <td>3.668094e-01</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hour</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1.410210e+07</td>\n",
       "      <td>1.493255e+00</td>\n",
       "      <td>1.410210e+07</td>\n",
       "      <td>1.410210e+07</td>\n",
       "      <td>1.410210e+07</td>\n",
       "      <td>1.410210e+07</td>\n",
       "      <td>1.410210e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C1</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1.005088e+03</td>\n",
       "      <td>1.156928e+00</td>\n",
       "      <td>1.001000e+03</td>\n",
       "      <td>1.005000e+03</td>\n",
       "      <td>1.005000e+03</td>\n",
       "      <td>1.005000e+03</td>\n",
       "      <td>1.012000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>banner_pos</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>2.299222e-01</td>\n",
       "      <td>4.646270e-01</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>7.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>device_type</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1.025540e+00</td>\n",
       "      <td>4.538988e-01</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>5.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>device_conn_type</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>2.233602e-01</td>\n",
       "      <td>6.671590e-01</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>5.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C14</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1.826220e+04</td>\n",
       "      <td>3.510366e+03</td>\n",
       "      <td>3.750000e+02</td>\n",
       "      <td>1.570700e+04</td>\n",
       "      <td>1.925100e+04</td>\n",
       "      <td>2.115300e+04</td>\n",
       "      <td>2.170500e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C15</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>3.189658e+02</td>\n",
       "      <td>1.945291e+01</td>\n",
       "      <td>1.200000e+02</td>\n",
       "      <td>3.200000e+02</td>\n",
       "      <td>3.200000e+02</td>\n",
       "      <td>3.200000e+02</td>\n",
       "      <td>1.024000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C16</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>5.649555e+01</td>\n",
       "      <td>3.654696e+01</td>\n",
       "      <td>2.000000e+01</td>\n",
       "      <td>5.000000e+01</td>\n",
       "      <td>5.000000e+01</td>\n",
       "      <td>5.000000e+01</td>\n",
       "      <td>1.024000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C17</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>2.041031e+03</td>\n",
       "      <td>4.412010e+02</td>\n",
       "      <td>1.120000e+02</td>\n",
       "      <td>1.722000e+03</td>\n",
       "      <td>2.161000e+03</td>\n",
       "      <td>2.420000e+03</td>\n",
       "      <td>2.497000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C18</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1.452260e+00</td>\n",
       "      <td>1.362637e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>3.000000e+00</td>\n",
       "      <td>3.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C19</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1.907794e+02</td>\n",
       "      <td>2.734394e+02</td>\n",
       "      <td>3.300000e+01</td>\n",
       "      <td>3.500000e+01</td>\n",
       "      <td>3.900000e+01</td>\n",
       "      <td>2.970000e+02</td>\n",
       "      <td>1.835000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C20</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>4.550590e+04</td>\n",
       "      <td>4.984381e+04</td>\n",
       "      <td>-1.000000e+00</td>\n",
       "      <td>-1.000000e+00</td>\n",
       "      <td>-1.000000e+00</td>\n",
       "      <td>1.000840e+05</td>\n",
       "      <td>1.002480e+05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C21</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>6.993616e+01</td>\n",
       "      <td>3.851384e+01</td>\n",
       "      <td>1.300000e+01</td>\n",
       "      <td>4.300000e+01</td>\n",
       "      <td>6.100000e+01</td>\n",
       "      <td>7.900000e+01</td>\n",
       "      <td>1.950000e+02</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     count          mean           std           min  \\\n",
       "id                999999.0  9.376309e+18  5.236908e+18  9.984920e+12   \n",
       "click             999999.0  1.602192e-01  3.668094e-01  0.000000e+00   \n",
       "hour              999999.0  1.410210e+07  1.493255e+00  1.410210e+07   \n",
       "C1                999999.0  1.005088e+03  1.156928e+00  1.001000e+03   \n",
       "banner_pos        999999.0  2.299222e-01  4.646270e-01  0.000000e+00   \n",
       "device_type       999999.0  1.025540e+00  4.538988e-01  0.000000e+00   \n",
       "device_conn_type  999999.0  2.233602e-01  6.671590e-01  0.000000e+00   \n",
       "C14               999999.0  1.826220e+04  3.510366e+03  3.750000e+02   \n",
       "C15               999999.0  3.189658e+02  1.945291e+01  1.200000e+02   \n",
       "C16               999999.0  5.649555e+01  3.654696e+01  2.000000e+01   \n",
       "C17               999999.0  2.041031e+03  4.412010e+02  1.120000e+02   \n",
       "C18               999999.0  1.452260e+00  1.362637e+00  0.000000e+00   \n",
       "C19               999999.0  1.907794e+02  2.734394e+02  3.300000e+01   \n",
       "C20               999999.0  4.550590e+04  4.984381e+04 -1.000000e+00   \n",
       "C21               999999.0  6.993616e+01  3.851384e+01  1.300000e+01   \n",
       "\n",
       "                           25%           50%           75%           max  \n",
       "id                4.846660e+18  9.834382e+18  1.373053e+19  1.844670e+19  \n",
       "click             0.000000e+00  0.000000e+00  0.000000e+00  1.000000e+00  \n",
       "hour              1.410210e+07  1.410210e+07  1.410210e+07  1.410210e+07  \n",
       "C1                1.005000e+03  1.005000e+03  1.005000e+03  1.012000e+03  \n",
       "banner_pos        0.000000e+00  0.000000e+00  0.000000e+00  7.000000e+00  \n",
       "device_type       1.000000e+00  1.000000e+00  1.000000e+00  5.000000e+00  \n",
       "device_conn_type  0.000000e+00  0.000000e+00  0.000000e+00  5.000000e+00  \n",
       "C14               1.570700e+04  1.925100e+04  2.115300e+04  2.170500e+04  \n",
       "C15               3.200000e+02  3.200000e+02  3.200000e+02  1.024000e+03  \n",
       "C16               5.000000e+01  5.000000e+01  5.000000e+01  1.024000e+03  \n",
       "C17               1.722000e+03  2.161000e+03  2.420000e+03  2.497000e+03  \n",
       "C18               0.000000e+00  1.000000e+00  3.000000e+00  3.000000e+00  \n",
       "C19               3.500000e+01  3.900000e+01  2.970000e+02  1.835000e+03  \n",
       "C20              -1.000000e+00 -1.000000e+00  1.000840e+05  1.002480e+05  \n",
       "C21               4.300000e+01  6.100000e+01  7.900000e+01  1.950000e+02  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe().T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 999999 entries, 0 to 999998\n",
      "Data columns (total 24 columns):\n",
      "id                  999999 non-null float64\n",
      "click               999999 non-null int64\n",
      "hour                999999 non-null int64\n",
      "C1                  999999 non-null int64\n",
      "banner_pos          999999 non-null int64\n",
      "site_id             999999 non-null object\n",
      "site_domain         999999 non-null object\n",
      "site_category       999999 non-null object\n",
      "app_id              999999 non-null object\n",
      "app_domain          999999 non-null object\n",
      "app_category        999999 non-null object\n",
      "device_id           999999 non-null object\n",
      "device_ip           999999 non-null object\n",
      "device_model        999999 non-null object\n",
      "device_type         999999 non-null int64\n",
      "device_conn_type    999999 non-null int64\n",
      "C14                 999999 non-null int64\n",
      "C15                 999999 non-null int64\n",
      "C16                 999999 non-null int64\n",
      "C17                 999999 non-null int64\n",
      "C18                 999999 non-null int64\n",
      "C19                 999999 non-null int64\n",
      "C20                 999999 non-null int64\n",
      "C21                 999999 non-null int64\n",
      "dtypes: float64(1), int64(14), object(9)\n",
      "memory usage: 183.1+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "##接下来对特征进行处理，先将类别特征进行编码\n",
    "#针对类型类的特征，先进行编码，编码之前构建字典\n",
    "from sklearn import preprocessing\n",
    "\n",
    "def label_encode(field,df):\n",
    "    dic = []\n",
    "    df_field = df[field]\n",
    "    list_field = df_field.tolist()\n",
    "\n",
    "    #构建field字典\n",
    "    for i in list_field:\n",
    "        if i not in dic:\n",
    "            dic.append(i)\n",
    "\n",
    "    label_field = preprocessing.LabelEncoder()\n",
    "    label_field.fit(dic)\n",
    "\n",
    "    df_field_enconde_tmp = label_field.transform(df_field)\n",
    "    df_field_enconde = pd.DataFrame(df_field_enconde_tmp, index=df.index, columns=[(field+'_enconde')])\n",
    "    return df_field_enconde"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# site_id             999999 non-null object\n",
    "# site_domain         999999 non-null object\n",
    "# site_category       999999 non-null object\n",
    "# app_id              999999 non-null object\n",
    "# app_domain          999999 non-null object\n",
    "# app_category        999999 non-null object\n",
    "# device_id           999999 non-null object\n",
    "# device_ip           999999 non-null object\n",
    "# device_model        999999 non-null object\n",
    "df_site_id_enconde = label_encode('site_id',df)\n",
    "df_site_domain_enconde = label_encode('site_domain',df)\n",
    "df_site_category_enconde = label_encode('site_category',df)\n",
    "df_app_id_enconde = label_encode('app_id',df)\n",
    "df_app_domain_enconde = label_encode('app_domain',df)\n",
    "df_app_category_enconde = label_encode('app_category',df)\n",
    "df_device_id_enconde = label_encode('device_id',df)\n",
    "df_device_ip_enconde = label_encode('device_ip',df)\n",
    "df_device_model_enconde = label_encode('device_model',df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#拼接特征回去\n",
    "# id                  999999 non-null float64\n",
    "# click               999999 non-null int64\n",
    "# hour                999999 non-null int64\n",
    "# C1                  999999 non-null int64\n",
    "# banner_pos          999999 non-null int64\n",
    "# site_id             999999 non-null object\n",
    "# site_domain         999999 non-null object\n",
    "# site_category       999999 non-null object\n",
    "# app_id              999999 non-null object\n",
    "# app_domain          999999 non-null object\n",
    "# app_category        999999 non-null object\n",
    "# device_id           999999 non-null object\n",
    "# device_ip           999999 non-null object\n",
    "# device_model        999999 non-null object\n",
    "# device_type         999999 non-null int64\n",
    "# device_conn_type    999999 non-null int64\n",
    "# C14                 999999 non-null int64\n",
    "# C15                 999999 non-null int64\n",
    "# C16                 999999 non-null int64\n",
    "# C17                 999999 non-null int64\n",
    "# C18                 999999 non-null int64,\n",
    "# C19                 999999 non-null int64\n",
    "# C20                 999999 non-null int64\n",
    "# C21                 999999 non-null int64\n",
    "pd_input = pd.concat([df[['click','banner_pos','device_type','device_conn_type'\n",
    "                          ,'C1','C14','C15','C16','C17','C18','C19','C20','C21']]\n",
    "                      ,df_site_id_enconde\n",
    "                      ,df_site_domain_enconde\n",
    "                      ,df_site_category_enconde\n",
    "                      ,df_app_id_enconde\n",
    "                      ,df_app_domain_enconde\n",
    "                      ,df_app_category_enconde\n",
    "                      ,df_device_id_enconde\n",
    "                      ,df_device_ip_enconde\n",
    "                      ,df_device_model_enconde], axis=1) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#进行特征的简单分析\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "#特征与y的分布，构建一个散点分布函数\n",
    "def draw_scatter(x, y, xLabel):\n",
    "    plt.figure(figsize=(10,5))\n",
    "    plt.scatter(x, y)\n",
    "    plt.title('%s VS Gender' %xLabel)\n",
    "    plt.xlabel(xLabel)\n",
    "    plt.ylabel('Gender')\n",
    "    plt.yticks(range(0, 2, 1)) # 纵轴起点，最大值，间隔, 对应的就是gender\n",
    "    plt.grid()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>click</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>0.160219</td>\n",
       "      <td>0.366809</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>banner_pos</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>0.229922</td>\n",
       "      <td>0.464627</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>device_type</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1.025540</td>\n",
       "      <td>0.453899</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>device_conn_type</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>0.223360</td>\n",
       "      <td>0.667159</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C1</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1005.088166</td>\n",
       "      <td>1.156928</td>\n",
       "      <td>1001.0</td>\n",
       "      <td>1005.0</td>\n",
       "      <td>1005.0</td>\n",
       "      <td>1005.0</td>\n",
       "      <td>1012.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C14</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>18262.199732</td>\n",
       "      <td>3510.366393</td>\n",
       "      <td>375.0</td>\n",
       "      <td>15707.0</td>\n",
       "      <td>19251.0</td>\n",
       "      <td>21153.0</td>\n",
       "      <td>21705.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C15</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>318.965807</td>\n",
       "      <td>19.452907</td>\n",
       "      <td>120.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>1024.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C16</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>56.495552</td>\n",
       "      <td>36.546962</td>\n",
       "      <td>20.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>1024.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C17</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>2041.030657</td>\n",
       "      <td>441.200951</td>\n",
       "      <td>112.0</td>\n",
       "      <td>1722.0</td>\n",
       "      <td>2161.0</td>\n",
       "      <td>2420.0</td>\n",
       "      <td>2497.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C18</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1.452260</td>\n",
       "      <td>1.362637</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C19</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>190.779412</td>\n",
       "      <td>273.439422</td>\n",
       "      <td>33.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>297.0</td>\n",
       "      <td>1835.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C20</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>45505.902746</td>\n",
       "      <td>49843.814296</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>100084.0</td>\n",
       "      <td>100248.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C21</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>69.936165</td>\n",
       "      <td>38.513837</td>\n",
       "      <td>13.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>61.0</td>\n",
       "      <td>79.0</td>\n",
       "      <td>195.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>site_id_enconde</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>934.602801</td>\n",
       "      <td>530.239200</td>\n",
       "      <td>0.0</td>\n",
       "      <td>260.0</td>\n",
       "      <td>1110.0</td>\n",
       "      <td>1110.0</td>\n",
       "      <td>2074.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>site_domain_enconde</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1401.135846</td>\n",
       "      <td>512.175721</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1205.0</td>\n",
       "      <td>1568.0</td>\n",
       "      <td>1773.0</td>\n",
       "      <td>2029.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>site_category_enconde</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>8.352048</td>\n",
       "      <td>6.740715</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>app_id_enconde</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>1847.034069</td>\n",
       "      <td>575.646933</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2044.0</td>\n",
       "      <td>2127.0</td>\n",
       "      <td>2127.0</td>\n",
       "      <td>2308.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>app_domain_enconde</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>68.299279</td>\n",
       "      <td>24.073600</td>\n",
       "      <td>0.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>155.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>app_category_enconde</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>2.277576</td>\n",
       "      <td>5.048106</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>device_id_enconde</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>53131.399758</td>\n",
       "      <td>10855.161812</td>\n",
       "      <td>0.0</td>\n",
       "      <td>55297.0</td>\n",
       "      <td>55297.0</td>\n",
       "      <td>55297.0</td>\n",
       "      <td>83429.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>device_ip_enconde</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>157051.264390</td>\n",
       "      <td>89922.831824</td>\n",
       "      <td>0.0</td>\n",
       "      <td>81069.0</td>\n",
       "      <td>156585.0</td>\n",
       "      <td>235055.0</td>\n",
       "      <td>313000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>device_model_enconde</th>\n",
       "      <td>999999.0</td>\n",
       "      <td>2318.054640</td>\n",
       "      <td>1251.152447</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1265.0</td>\n",
       "      <td>2459.0</td>\n",
       "      <td>3369.0</td>\n",
       "      <td>4580.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          count           mean           std     min      25%  \\\n",
       "click                  999999.0       0.160219      0.366809     0.0      0.0   \n",
       "banner_pos             999999.0       0.229922      0.464627     0.0      0.0   \n",
       "device_type            999999.0       1.025540      0.453899     0.0      1.0   \n",
       "device_conn_type       999999.0       0.223360      0.667159     0.0      0.0   \n",
       "C1                     999999.0    1005.088166      1.156928  1001.0   1005.0   \n",
       "C14                    999999.0   18262.199732   3510.366393   375.0  15707.0   \n",
       "C15                    999999.0     318.965807     19.452907   120.0    320.0   \n",
       "C16                    999999.0      56.495552     36.546962    20.0     50.0   \n",
       "C17                    999999.0    2041.030657    441.200951   112.0   1722.0   \n",
       "C18                    999999.0       1.452260      1.362637     0.0      0.0   \n",
       "C19                    999999.0     190.779412    273.439422    33.0     35.0   \n",
       "C20                    999999.0   45505.902746  49843.814296    -1.0     -1.0   \n",
       "C21                    999999.0      69.936165     38.513837    13.0     43.0   \n",
       "site_id_enconde        999999.0     934.602801    530.239200     0.0    260.0   \n",
       "site_domain_enconde    999999.0    1401.135846    512.175721     0.0   1205.0   \n",
       "site_category_enconde  999999.0       8.352048      6.740715     0.0      2.0   \n",
       "app_id_enconde         999999.0    1847.034069    575.646933     0.0   2044.0   \n",
       "app_domain_enconde     999999.0      68.299279     24.073600     0.0     73.0   \n",
       "app_category_enconde   999999.0       2.277576      5.048106     0.0      0.0   \n",
       "device_id_enconde      999999.0   53131.399758  10855.161812     0.0  55297.0   \n",
       "device_ip_enconde      999999.0  157051.264390  89922.831824     0.0  81069.0   \n",
       "device_model_enconde   999999.0    2318.054640   1251.152447     0.0   1265.0   \n",
       "\n",
       "                            50%       75%       max  \n",
       "click                       0.0       0.0       1.0  \n",
       "banner_pos                  0.0       0.0       7.0  \n",
       "device_type                 1.0       1.0       5.0  \n",
       "device_conn_type            0.0       0.0       5.0  \n",
       "C1                       1005.0    1005.0    1012.0  \n",
       "C14                     19251.0   21153.0   21705.0  \n",
       "C15                       320.0     320.0    1024.0  \n",
       "C16                        50.0      50.0    1024.0  \n",
       "C17                      2161.0    2420.0    2497.0  \n",
       "C18                         1.0       3.0       3.0  \n",
       "C19                        39.0     297.0    1835.0  \n",
       "C20                        -1.0  100084.0  100248.0  \n",
       "C21                        61.0      79.0     195.0  \n",
       "site_id_enconde          1110.0    1110.0    2074.0  \n",
       "site_domain_enconde      1568.0    1773.0    2029.0  \n",
       "site_category_enconde       6.0      19.0      20.0  \n",
       "app_id_enconde           2127.0    2127.0    2308.0  \n",
       "app_domain_enconde         73.0      73.0     155.0  \n",
       "app_category_enconde        0.0       3.0      22.0  \n",
       "device_id_enconde       55297.0   55297.0   83429.0  \n",
       "device_ip_enconde      156585.0  235055.0  313000.0  \n",
       "device_model_enconde     2459.0    3369.0    4580.0  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd_input.describe().T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "##处理过的数据保存下来\n",
    "pd_input.to_csv('./out_put/encode_data.csv', header=True, index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
